{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 提取特征值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ijson\n",
    "import os\n",
    "import csv\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_dir = \"json_data\"\n",
    "output_file = \"dataset.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_sni(layers):\n",
    "    if (\n",
    "        \"tls\" not in layers\n",
    "        or \"tls.record\" not in layers[\"tls\"]\n",
    "        or \"tls.handshake\" not in layers[\"tls\"][\"tls.record\"]\n",
    "    ):\n",
    "        return None\n",
    "\n",
    "    handshake = layers[\"tls\"][\"tls.record\"][\"tls.handshake\"]\n",
    "    if not isinstance(handshake, dict):\n",
    "        return None\n",
    "\n",
    "    for extension_key, extension_value in handshake.items():\n",
    "        if (\n",
    "            extension_key.startswith(\"Extension: server_name\")\n",
    "            and \"Server Name Indication extension\" in extension_value\n",
    "        ):\n",
    "            return extension_value[\"Server Name Indication extension\"][\n",
    "                \"tls.handshake.extensions_server_name\"\n",
    "            ]\n",
    "\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 获取所有 JSON 文件的列表\n",
    "json_files = [\n",
    "    os.path.join(input_dir, file)\n",
    "    for file in os.listdir(input_dir)\n",
    "    if file.endswith(\".json\")\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 写入CSV文件头\n",
    "with open(output_file, \"w\", newline=\"\", encoding=\"utf-8-sig\") as csvfile:\n",
    "    fieldnames = [\"classes\", \"filename\", \"stream_index\", \"sni\"] + [\n",
    "        f\"len{i+1}\" for i in range(30)\n",
    "    ]\n",
    "    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
    "    writer.writeheader()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing www.news.cn.json:   0%|          | 0.00/6.25G [00:00<?, ?B/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing www.news.cn.json: 100%|██████████| 6.25G/6.25G [00:26<00:00, 236MB/s]\n",
      "Processing www.qimao.com.json: 100%|██████████| 132M/132M [00:00<00:00, 192MB/s] \n",
      "Processing dxy.com.json: 100%|██████████| 58.8M/58.8M [00:00<00:00, 165MB/s]\n",
      "Processing bbs.elecfans.com.json: 100%|██████████| 35.6M/35.6M [00:00<00:00, 210MB/s]\n",
      "Processing www.7k7k.com.json: 100%|██████████| 1.48G/1.48G [00:06<00:00, 236MB/s]\n",
      "Processing bbs.kanxue.com.json: 100%|██████████| 9.28G/9.28G [00:42<00:00, 218MB/s]\n",
      "Processing my.4399.com.json: 100%|██████████| 2.60G/2.60G [00:10<00:00, 254MB/s]\n",
      "Processing www.haodf.com.json: 100%|██████████| 83.8M/83.8M [00:00<00:00, 161MB/s]\n",
      "Processing blog.csdn.net.json: 100%|██████████| 157M/157M [00:00<00:00, 176MB/s] \n",
      "Processing www.3dmgame.com.json: 100%|██████████| 570k/570k [00:00<00:00, 94.9MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CSV file created successfully.\n"
     ]
    }
   ],
   "source": [
    "# 处理每个 JSON 文件\n",
    "for json_file in json_files:\n",
    "    file_size = os.path.getsize(json_file)\n",
    "    tls_packets = []\n",
    "    sni_dict = {}\n",
    "    tcp_len_dict = {}\n",
    "\n",
    "    with open(json_file, \"r\", encoding=\"utf-8\") as f:\n",
    "        objects = ijson.items(f, \"item\")\n",
    "        with tqdm(\n",
    "            total=file_size,\n",
    "            desc=f\"Processing {os.path.basename(json_file)}\",\n",
    "            unit=\"B\",\n",
    "            unit_scale=True,\n",
    "        ) as pbar:\n",
    "            for packet in objects:\n",
    "                pbar.update(f.tell() - pbar.n)\n",
    "                layers = packet.get(\"_source\", {}).get(\"layers\", {})\n",
    "                if not layers or \"tcp\" not in layers:\n",
    "                    continue\n",
    "\n",
    "                stream_index = layers[\"tcp\"].get(\"tcp.stream\")\n",
    "                if stream_index is None:\n",
    "                    continue\n",
    "\n",
    "                if \"tls\" in layers:\n",
    "                    sni = extract_sni(layers)\n",
    "                    if sni:\n",
    "                        sni_dict.setdefault(stream_index, set()).add(sni)\n",
    "\n",
    "                tcp_len = int(layers[\"tcp\"].get(\"tcp.len\", 0))\n",
    "                tcp_len_dict.setdefault(stream_index, []).append(tcp_len)\n",
    "\n",
    "        # 每次处理完一个文件后，写入并清空内容\n",
    "        with open(output_file, \"a\", newline=\"\", encoding=\"utf-8-sig\") as csvfile:\n",
    "            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n",
    "            for stream_index, sni_set in sni_dict.items():\n",
    "                tcp_lengths = tcp_len_dict.get(stream_index, [])\n",
    "                # 限制为 30 个元素，少于 30 个则用 0 填充\n",
    "                tcp_lengths = (tcp_lengths[:30] + [0] * 30)[:30]\n",
    "                row = {\n",
    "                    \"classes\": os.path.splitext(os.path.basename(json_file))[0],\n",
    "                    \"filename\": os.path.basename(json_file),\n",
    "                    \"stream_index\": stream_index,\n",
    "                    \"sni\": \", \".join(sni_set),\n",
    "                }\n",
    "                row.update(\n",
    "                    {f\"len{i+1}\": length for i, length in enumerate(tcp_lengths)}\n",
    "                )\n",
    "                writer.writerow(row)\n",
    "\n",
    "        # 清空字典\n",
    "        tls_packets.clear()\n",
    "        sni_dict.clear()\n",
    "        tcp_len_dict.clear()\n",
    "\n",
    "print(\"CSV file created successfully.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
